import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.nonparametric.smoothers_lowess import lowess
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn import ensemble
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
import itertools
import graphviz
import copy
# Show every column when displaying wide DataFrames.
pd.options.display.max_columns = None
# Play-by-play tracking data (one row per player per play); columns such as
# NflId/NflIdRusher are used below — presumably the NFL Big Data Bowl dataset.
nfl = pd.read_csv('train.csv')
nfl.head()
# Query only data from Runners (rows where the tracked player is the rusher)
runners = nfl.query('NflId == NflIdRusher')
runners = runners.reset_index(drop=True)
runners_featured = pd.DataFrame()
# Convert Runners Home/Away to Binary (home=1, away=0)
runners_featured['Team_Home'] = runners.Team.map(dict(home=1, away=0))
runners_featured['Dis'] = runners['Dis']
# Refactor Runner names to be a quantitative value (yards/game).
# A single groupby replaces the original per-player `query` loop, which ran a
# full-frame scan for every player (O(n*players) -> O(n)).
runners_df = (runners.groupby('DisplayName', sort=False)
              .agg(sum_yards=('Yards', 'sum'), games=('GameId', 'nunique'))
              .reset_index())
# Calculate ypg for each runner
runners_df['ypg'] = runners_df['sum_yards'] / runners_df['games']
# Merge the runners df with the running plays dataframe to define Player using their ypg
runners = runners.merge(runners_df, 'left', on='DisplayName')
runners.head()
runners_featured['ypg'] = runners['ypg']
# Determine Yard line (0: own end line, 100: touchdown line).
# When the ball is on the possession team's side of the field the raw YardLine
# is already the distance from their own end line; otherwise mirror it.
runners['YardLine_refactor'] = np.where(runners['PossessionTeam'] == runners['FieldPosition'],
                                        runners['YardLine'], 100 - runners['YardLine'])
runners_featured['YardLine_refactor'] = list(runners['YardLine_refactor'])
# Calculate Time Elapsed in seconds.
# GameClock is 'MM:SS:00'; strip the trailing ':00' and prefix '00:' to parse as H:M:S.
runners['GameClock_timedata'] = np.array(pd.to_timedelta('00:' + runners['GameClock'].astype(str).str[:-3]))
# (Quarter-1)*15min + 15min - clock simplifies to Quarter*15min - clock;
# vectorized Timedelta arithmetic replaces the row-wise apply.
runners['TimeElapsed'] = runners['Quarter'] * pd.Timedelta(minutes=15) - runners['GameClock_timedata']
runners['TimeElapsed'] = runners['TimeElapsed'].dt.seconds
runners_featured['TimeElapsed'] = runners['TimeElapsed']
# Refactor Possession Team and Defense Team based on total running yards allowed.
# The defense is the home team when the visitors have possession, else the visitors.
runners['DefenseTeam'] = np.where(runners['PossessionTeam'] == runners['VisitorTeamAbbr'],
                                  runners['HomeTeamAbbr'], runners['VisitorTeamAbbr'])
# Gather Total Rushing Data for each team.
# groupby replaces the original per-team `query` loop (one pass instead of one
# full-frame scan per team).
team_rushing_df = (runners.groupby('PossessionTeam', sort=False)
                   .agg(team_rushing_yards=('Yards', 'sum'))
                   .reset_index())
team_rushing_df['team_rushing_yards'] = team_rushing_df['team_rushing_yards'].astype(float)
# Gather Total Defensive Rushing yards allowed for each team
team_defense_df = (runners.groupby('DefenseTeam', sort=False)
                   .agg(team_defense_rushing_yards=('Yards', 'sum'))
                   .reset_index())
team_defense_df['team_defense_rushing_yards'] = team_defense_df['team_defense_rushing_yards'].astype(float)
# Merge the team total rushing df and defense df with the running plays dataframe to define
# Possession Team and Defensive teams using their rushing yards and rushing yards allowed
runners = runners.merge(team_rushing_df, 'left', on='PossessionTeam')
runners = runners.merge(team_defense_df, 'left', on='DefenseTeam')
runners_featured['team_rushing_yards'] = runners['team_rushing_yards']
runners_featured['team_defense_rushing_yards'] = runners['team_defense_rushing_yards']
# Down and Distance
# TODO: probably should consider combining these
runners_featured['Down'] = runners['Down']
runners_featured['Distance'] = runners['Distance']
# Convert Home and Away Score to Offense / Defense Score: the possession side's
# score is the home score when the runner's team is 'home', otherwise the visitors'.
home_possession = runners['Team'] == 'home'
runners['PossessionScore'] = runners['HomeScoreBeforePlay'].where(home_possession, runners['VisitorScoreBeforePlay'])
runners['DefenseScore'] = runners['HomeScoreBeforePlay'].where(~home_possession, runners['VisitorScoreBeforePlay'])
runners['ScoreDifferential'] = runners['PossessionScore'] - runners['DefenseScore']
# TODO: look into combine with time remaining
runners_featured['PossessionScore'] = runners['PossessionScore']
runners_featured['DefenseScore'] = runners['DefenseScore']
runners_featured['ScoreDifferential'] = runners['ScoreDifferential']
# Offense Personel.
# str.extract returns strings; cast the captured digit to float so these counts
# are numeric (otherwise downstream arithmetic such as DL + LB concatenates
# text, and scaling operates on object columns).
# OL: if nan assumed to be 5 (typical personel)
runners['OL'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sOL+')[0].astype(float)
runners['OL'] = runners['OL'].fillna(5)
runners['TE'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sTE+')[0].astype(float)
runners['WR'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sWR+')[0].astype(float)
runners['RB'] = runners['OffensePersonnel'].str.extract(r'.*([0-9])\sRB+')[0].astype(float)
runners_featured['OL'] = runners['OL']
runners_featured['TE'] = runners['TE']
runners_featured['WR'] = runners['WR']
runners_featured['RB'] = runners['RB']
# Offense Formation
# Determine if Shotgun/Pistol, Wildcat or under center (vectorized, same 0/1 flags)
runners['Shotgun'] = runners['OffenseFormation'].isin(['SHOTGUN', 'PISTOL']).astype(int)
runners['Wildcat'] = (runners['OffenseFormation'] == 'WILDCAT').astype(int)
runners_featured['Shotgun'] = runners['Shotgun']
runners_featured['Wildcat'] = runners['Wildcat']
# Defense Personel
runners['DL'] = runners['DefensePersonnel'].str.extract(r'.*([0-9])\sDL+')[0].astype(float)
runners['LB'] = runners['DefensePersonnel'].str.extract(r'.*([0-9])\sLB+')[0].astype(float)
runners['DB'] = runners['DefensePersonnel'].str.extract(r'.*([0-9])\sDB+')[0].astype(float)
runners_featured['DL'] = runners['DL']
runners_featured['LB'] = runners['LB']
runners_featured['DB'] = runners['DB']
# Defenders in the box
# One NaN value re-factored to be DL + LB.
# DL/LB come from str.extract and may be strings, so coerce to numeric before
# adding — string '4' + '3' would concatenate to '43' instead of summing to 7.
box_fallback = pd.to_numeric(runners['DL']) + pd.to_numeric(runners['LB'])
# Assign back instead of attribute-access inplace fillna (avoids chained-assignment pitfalls).
runners['DefendersInTheBox'] = runners['DefendersInTheBox'].fillna(box_fallback)
runners_featured['DefendersInTheBox'] = runners['DefendersInTheBox']
# Play Direction
runners['PlayDirection_Left'] = runners.PlayDirection.map(dict(left=1, right=0))
runners_featured['PlayDirection_Left'] = runners['PlayDirection_Left']
# Runner Height: 'feet-inches' (e.g. '6-2') -> total inches.
# Extract once and cast with .astype(float): avoids the deprecated np.float_
# alias (removed in NumPy 2.0) and a duplicated regex pass over the column.
height_parts = runners['PlayerHeight'].str.extract(r'(\d+)-(\d+)').astype(float)
runners['PlayerHeight_inches'] = height_parts[0] * 12 + height_parts[1]
runners_featured['PlayerHeight_inches'] = runners['PlayerHeight_inches']
# Runner Weight
runners_featured['PlayerWeight'] = runners['PlayerWeight']
# Runner Age: time at snap - birth date (TimeSnap's first 10 chars are the date)
runners['Age_days'] = pd.to_datetime(runners['TimeSnap'].astype(str).str[:10]) - pd.to_datetime(runners['PlayerBirthDate'])
runners['Age_days'] = runners['Age_days'].dt.days
runners_featured['Age_days'] = runners['Age_days']
# Position: one-hot flags for the runner's listed position.
# Vectorized comparison replaces five per-row apply passes; the loop preserves
# the original column creation order (RB, WR, FB, HB, QB).
for pos in ('RB', 'WR', 'FB', 'HB', 'QB'):
    runners[f'Runner_{pos}'] = (runners['Position'] == pos).astype(int)
    runners_featured[f'Runner_{pos}'] = runners[f'Runner_{pos}']
# Week
runners_featured['Week'] = runners['Week']
# Stadium
# The raw StadiumType strings are dirty (typos and variants), so the membership
# lists below enumerate every observed spelling for each roof category — do not
# "fix" the typos, they match real data values.
outdoors_names = ['Outdoor', 'Outdoors', 'Outddors', 'Oudoor', 'Ourdoor', 'Heinz Field', 'Outdor', 'Cloudy', 'Bowl', 'Outside', 'OUTDOOR']
indoor_names = ['Indoors', 'Indoor', 'Retr. Roof-Closed', 'Retr. Roof - Closed','Dome', 'Domed, closed','Indoor, Roof Closed', 'Retr. Roof Closed','Closed Dome','Dome, closed','Domed', 'Indoor','indoor','Retractable Roof - Closed']
retractable_open_names = ['Retractable Roof', 'Open', 'Indoor, Open Roof','Outdoor Retr Roof-Open','Retr. Roof-Open','Domed, Open', 'Domed, open','Indoor, roof open',]
# Vectorized isin replaces the per-row apply; same 0/1 flags.
runners['Outdoors'] = runners['StadiumType'].isin(outdoors_names).astype(int)
runners['Indoors'] = runners['StadiumType'].isin(indoor_names).astype(int)
runners['Retractable_open'] = runners['StadiumType'].isin(retractable_open_names).astype(int)
runners_featured['Outdoors'] = runners['Outdoors']
runners_featured['Indoors'] = runners['Indoors']
runners_featured['Retractable_open'] = runners['Retractable_open']
# Turf
grass_names = ['Grass', 'Natural Grass','Natural grass', 'grass', 'Natural','Naturall Grass','natural grass']
runners['Natural_Grass'] = runners['Turf'].isin(grass_names).astype(int)
runners_featured['Natural_Grass'] = runners['Natural_Grass']
# Weather: rainy/snowy weather and cold weather.
# Like StadiumType, GameWeather strings are dirty; the list enumerates every
# observed rainy/snowy value verbatim.
rainy_weather = ['Light Rain', 'Showers','Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
                 'Rain','Cloudy, fog started developing in 2nd quarter',
                 'Rain likely, temps in low 40s.','Cloudy, light snow accumulating 1-3"','Heavy lake effect snow', 'Snow','Scattered Showers',
                 'Cloudy, 50% change of rain','Cloudy, Rain', 'Rain shower','Rainy','Light rain','Cloudy with showers and wind','Raining', 'Rain and Wind']
runners['rainy_weather'] = runners['GameWeather'].isin(rainy_weather).astype(int)
runners_featured['rainy_weather'] = runners['rainy_weather']
# Refactor NaNs in temperature and humidity to be stadiums average temp/ humidity.
# groupby replaces the per-stadium query loop; only stadiums that actually have
# missing temperatures are averaged and merged, matching the original behavior.
null_temp_stadiums = runners.loc[runners['Temperature'].isnull(), 'Stadium'].unique()
stadium_df = (runners[runners['Stadium'].isin(null_temp_stadiums)]
              .groupby('Stadium', sort=False)
              .agg(average_temperature=('Temperature', 'mean'),
                   average_humidity=('Humidity', 'mean'))
              .reset_index())
runners = runners.merge(stadium_df, 'left', on='Stadium')
# Assign back instead of attribute-access inplace fillna (chained-assignment safe).
runners['Temperature'] = runners['Temperature'].fillna(runners['average_temperature'])
runners['Humidity'] = runners['Humidity'].fillna(runners['average_humidity'])
runners_featured['Humidity'] = runners['Humidity']
runners_featured['Temperature'] = runners['Temperature']
# runners_featured['WindSpeed'] = runners['WindSpeed']
# Split the engineered features into binary indicator columns and continuous
# columns so only the continuous ones get standardized.
bool_cols, continuous_cols = [], []
for col in runners_featured:
    values = runners_featured[col].dropna().unique()
    if np.isin(values, [0, 1]).all():
        bool_cols.append(col)
    else:
        continuous_cols.append(col)
runners_featured_bool = runners_featured[bool_cols]
runners_featured_cont = runners_featured[continuous_cols]
# Standardize continuous features to zero mean / unit variance.
scaler = StandardScaler()
runners_featured_cont_scaled = scaler.fit_transform(runners_featured_cont)
runners_featured_cont_scaled_df = pd.DataFrame(runners_featured_cont_scaled,
                                               index=runners_featured_cont.index,
                                               columns=runners_featured_cont.columns)
# Recombine binary and scaled-continuous features on the row index.
runners_scaled_df = runners_featured_bool.merge(runners_featured_cont_scaled_df,
                                                left_index=True, right_index=True)
# Quick visual scan of the first ten features, and the response distribution.
fig = px.scatter_matrix(runners_featured.iloc[:, :10])
fig.show()
fig_4 = px.histogram(runners[['Yards']])
fig_4.show()
This histogram of all the rushing-yard outcomes shows that the distribution is somewhat right skewed; we keep this in mind when building the models and when running diagnostics.
# Join the scaled feature matrix with the response so Yards appears in the
# correlation matrix.
runners_scaled_response_df = runners_scaled_df.merge(runners['Yards'], left_index=True, right_index=True)
correlation_cols = runners_scaled_response_df.columns
corr = runners_scaled_response_df[correlation_cols].corr()
corr.style.background_gradient(cmap='coolwarm')
# Rank predictors by absolute correlation with Yards (Yards itself dropped).
yards_df = corr[['Yards']].reindex(corr[['Yards']].Yards.abs().sort_values(ascending=False).index).drop(['Yards'])
yards_df
This correlation matrix is showing some of the collinearity issues that may arise due to the predictors being correlated. A few that stand out are Player Height and Player Weight; perhaps only one of these is needed in the model. Other noticeable relationships are that the offensive (OL, TE, RB and WR) position groups all have relatively high correlation to each other. This makes sense because if there are more players in one group there are fewer in the others (since each team always has 11 players split between the groups). There is a similar correlation between the defensive position groups (DL, LB and DB). Also, DB is highly negatively correlated with Defenders in the box (the box is an area near the line of scrimmage on the defensive side of the ball). This makes sense because typically DBs are outside of this box; perhaps removing DB as a predictor and using defenders in the box will improve collinearity.
Also, there is a high correlation between score differential and the offensive team's score, as well as a negative correlation between the defensive team's score and score differential. This makes sense because score differential was calculated from the offense and defense scores. These are all predictors that could be changed to remove some of this collinearity. Finally, both offense and defense scores are highly correlated with time elapsed, which makes sense because score can only increase as time elapsed increases.
There were other interesting interactions among the predictors as well. For example, rushing yards per game seems to be positively correlated with player weight. Also, a team's rushing yards allowed (on defense) is slightly negatively correlated with the number of Defensive Linemen. Finally, Week number (from week 1 at the beginning of the season to 17 at the end) is highly negatively correlated with temperature, which makes sense as the season starts in August and ends in December.
Before making any changes to the predictors, a Variance Inflation Factor (VIF) analysis will be run to quantify how much each predictor's variance is inflated by collinearity with the others.
# Variance Inflation Factor for each scaled predictor; large values indicate
# that the predictor is largely explained by the others (collinearity).
VIF = pd.DataFrame([variance_inflation_factor(runners_scaled_df.values, i)
                    for i in range(runners_scaled_df.shape[1])],
                   index=runners_scaled_df.columns, columns=['VIF'])
VIF
# Drop one member of each collinear group identified above (e.g. keep
# PlayerWeight over PlayerHeight_inches, DefendersInTheBox over DB).
remove_cols = ['Runner_RB','Dis', 'Indoors','Natural_Grass','ScoreDifferential','RB','DB','PlayerHeight_inches','Week']
runners_scaled_response_drop_df = runners_scaled_response_df.drop(remove_cols, axis=1)
runners_scaled_drop_df = runners_scaled_df.drop(remove_cols, axis=1)
# Re-check correlations after the drop.
correlation_cols = runners_scaled_response_drop_df.columns
corr = runners_scaled_response_drop_df[correlation_cols].corr()
corr.style.background_gradient(cmap='coolwarm')
# Predictors ranked by |correlation| with Yards (Yards itself removed).
yards_df = corr[['Yards']].reindex(corr[['Yards']].Yards.abs().sort_values(ascending=False).index).drop(['Yards'])
yards_df
# Re-check VIF after the drop.
VIF = pd.DataFrame([variance_inflation_factor(runners_scaled_drop_df.values, i)
                    for i in range(runners_scaled_drop_df.shape[1])],
                   index=runners_scaled_drop_df.columns, columns=['VIF'])
VIF
# Final design matrix / response; 75/25 train-test split with a fixed seed for
# reproducibility.
X = runners_scaled_drop_df
X = X.astype(float)
y = runners[['Yards']]
y = y.astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Response rows aligned to the split indices (same values as y_train/y_test here).
y_train_regression = runners[['Yards']].iloc[X_train.index]
y_test_regression = runners[['Yards']].iloc[X_test.index]
A Multiple Linear Regression will now be calculated on full data set to get a summary of statistics from the ordinary least squares analysis. This will give more insight into which predictors in particular are important to this regression
# Fit OLS on the full (scaled) feature set to inspect coefficient statistics;
# add_constant supplies the intercept column. X_sm is reused later for the
# interaction-term analysis.
X_sm = sm.add_constant(X)
model = sm.OLS(y, X_sm).fit()
model.summary()
# Predictors significant at the 5% level.
model.pvalues[model.pvalues < 0.05]
From this ordinary least squares analysis, the r^2 value is extremely low. This makes sense because it is such a complex problem with many 'human factors' involved. However, as the saying goes, Football is a 'game of inches' meaning that any insight at all, however low the r^2 value could be valuable for teams gaining the upper hand in strategy on their opponents.
The p-values show that a few variables have more statistical significance than others, including whether the runner is a Wide Receiver, the yards per game for the runner, the field position of the offense, the distance to first down, and others.
# Residuals vs. Fitted diagnostic: a LOWESS curve near the zero line with an
# even vertical spread suggests linearity and homoscedasticity.
residuals = model.resid
fitted = model.fittedvalues
smoothed = lowess(residuals, fitted)
fig, ax = plt.subplots()
ax.scatter(fitted, residuals, edgecolors='k', facecolors='none')
# Red LOWESS trend through the residuals.
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
ax.set_ylabel('Residuals')
ax.set_xlabel('Fitted Values')
ax.set_title('Residuals vs. Fitted')
# Dotted zero-residual reference line.
ax.plot([min(fitted), max(fitted)], [0, 0], color='k', linestyle=':', alpha=.3)
This Residual plot shows many values with very high residuals and isn't evenly scattered, so perhaps regression isn't the best choice for this complex problem.
# Normal Q-Q plot of the internally studentized residuals.
sorted_student_residuals = pd.Series(model.get_influence().resid_studentized_internal)
sorted_student_residuals.index = model.resid.index
sorted_student_residuals = sorted_student_residuals.sort_values(ascending=True)
df = pd.DataFrame(sorted_student_residuals)
df.columns = ['sorted_student_residuals']
df['theoretical_quantiles'] = stats.probplot(df['sorted_student_residuals'], dist='norm', fit=False)[0]
fig, ax = plt.subplots()
# Use descriptive locals instead of x/y: the original rebound the global
# response `y` here, silently corrupting every later fit that reads `y`
# (e.g. the interaction-term OLS below). Also drops the unused `rankings`.
theo_q = df['theoretical_quantiles']
resid_q = df['sorted_student_residuals']
ax.scatter(theo_q, resid_q, edgecolor='k', facecolor='none')
ax.set_title('Normal Q-Q')
ax.set_ylabel('Standardized Residuals')
ax.set_xlabel('Theoretical Quantiles')
# 45-degree reference line spanning the joint data range.
lo = np.min([theo_q, resid_q])
hi = np.max([theo_q, resid_q])
ax.plot([lo, hi], [lo, hi], color='r', ls='--')
This plot confirms that the distribution of yards is right skewed, with many large outliers.
# Scale-Location plot: sqrt(|studentized residuals|) vs fitted values; a flat
# LOWESS line indicates homoscedastic errors.
student_residuals = model.get_influence().resid_studentized_internal
sqrt_student_residuals = pd.Series(np.sqrt(np.abs(student_residuals)))
sqrt_student_residuals.index = model.resid.index
smoothed = lowess(sqrt_student_residuals, fitted)
fig, ax = plt.subplots()
ax.scatter(fitted, sqrt_student_residuals, edgecolors='k', facecolors='none')
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
# Raw string: the LaTeX label contains backslashes ('\s', '\ ') that are
# invalid escape sequences in a normal literal (SyntaxWarning on Python 3.12+).
ax.set_ylabel(r'$\sqrt{|Studentized \ Residuals|}$')
ax.set_xlabel('Fitted Values')
ax.set_title('Scale-Location')
ax.set_ylim(0, max(sqrt_student_residuals) + 0.1)
This plot shows that there is one outlier, and the residuals aren't completely random across the range of fitted values; the residuals spread wider along the axis.
# Residuals vs. Leverage with Cook's-distance contours: flags influential points.
student_residuals = pd.Series(model.get_influence().resid_studentized_internal)
student_residuals.index = model.resid.index
df = pd.DataFrame(student_residuals)
df.columns = ['student_residuals']
# Leverage = diagonal of the hat matrix.
df['leverage'] = model.get_influence().hat_matrix_diag
smoothed = lowess(df['student_residuals'], df['leverage'])
sorted_student_residuals = abs(df['student_residuals']).sort_values(ascending=False)
fig, ax = plt.subplots()
# Descriptive locals instead of x/y — the original rebound the global response
# `y` here, corrupting later model fits that read `y`.
lev = df['leverage']
resid_s = df['student_residuals']
# x-position for the Cook's-distance contour annotations (just past max leverage).
xpos = max(lev) + max(lev) * 0.01
ax.scatter(lev, resid_s, edgecolors='k', facecolors='none')
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
ax.set_ylabel('Studentized Residuals')
ax.set_xlabel('Leverage')
ax.set_title('Residuals vs. Leverage')
ax.set_ylim(min(resid_s) - min(resid_s) * 0.15, max(resid_s) + max(resid_s) * 0.15)
ax.set_xlim(-0.01, max(lev) + max(lev) * 0.05)
plt.tight_layout()
# Cook's distance contours at D = 0.5 and D = 1.0:
# |studentized residual| = sqrt(D * p * (1 - h) / h) for leverage h, p parameters.
cooksx = np.linspace(min(lev), xpos, 50)
p = len(model.params)
poscooks1y = np.sqrt((p * (1 - cooksx)) / cooksx)
poscooks05y = np.sqrt(0.5 * (p * (1 - cooksx)) / cooksx)
negcooks1y = -np.sqrt((p * (1 - cooksx)) / cooksx)
negcooks05y = -np.sqrt(0.5 * (p * (1 - cooksx)) / cooksx)
ax.plot(cooksx, poscooks1y, label="Cook's Distance", ls=':', color='r')
ax.plot(cooksx, poscooks05y, ls=':', color='r')
ax.plot(cooksx, negcooks1y, ls=':', color='r')
ax.plot(cooksx, negcooks05y, ls=':', color='r')
# Dotted axes through the origin for reference.
ax.plot([0, 0], ax.get_ylim(), ls=":", alpha=.3, color='k')
ax.plot(ax.get_xlim(), [0, 0], ls=":", alpha=.3, color='k')
ax.annotate('1.0', xy=(xpos, poscooks1y[-1]), color='r')
ax.annotate('0.5', xy=(xpos, poscooks05y[-1]), color='r')
ax.annotate('1.0', xy=(xpos, negcooks1y[-1]), color='r')
ax.annotate('0.5', xy=(xpos, negcooks05y[-1]), color='r')
ax.legend()
plt.show()
This plot shows that all cases are inside Cook's Distance lines, so there are no high leverage points.
These linear regression diagnostic plots show that perhaps linear regression is not the best solution to this complex problem. Since the data is right skewed, with many outliers of high rushing yards, perhaps some featuring could be done to remove these data points or refactor them with some transformation. For the sake of comparing models, linear regression will still be used but with the knowledge that some of the issues that may arise come from these outliers and non-linearity that comes from the complexity of the problem.
# Interaction (synergy) analysis: pairwise interaction terms only (no squared terms).
poly = PolynomialFeatures(2, interaction_only=True, include_bias=False)
x_interaction = poly.fit_transform(X_sm)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out
# produces the same "a b" style interaction names.
interaction_df = pd.DataFrame(x_interaction, index=X_sm.index,
                              columns=poly.get_feature_names_out(X_sm.columns))
model = sm.OLS(y, interaction_df).fit()
model.summary()
# Rebuild the coefficient table from the text summary into a numeric DataFrame.
interaction_summary = pd.DataFrame((model.summary().tables[1]))
columns = list(interaction_summary.iloc[0].astype(str))
interaction_summary.columns = columns
interaction_summary = interaction_summary[1:]
interaction_summary.index = list(interaction_summary.iloc[0:, 0].astype(str))
interaction_summary = interaction_summary.drop(columns=[''])
interaction_summary = interaction_summary.rename(columns={'P>|t|': "p"})
interaction_summary = interaction_summary.astype(str).astype(float)
interaction_summary
# Most significant interaction terms first.
interaction_summary.sort_values(by=['p']).head(29)
model.pvalues[model.pvalues < 0.05]
In this analysis of interaction (synergy) effects from the predictors many were found to have interacting effects. Some of these include the 'Wildcat' formation interacting with the predictor for if the QB is the runner, the runners ypg and the down. Other interacting variables are the Runner_WR (indicating the runner is a WR) with ypg, yard line, down, distance, number of TE's and WR's in the formation and yards per game. Another interaction is PossessionScore (offense score) and DefenseScore.
# Accumulators for comparing regression models by train/test MSE; filled in as
# each model below is evaluated.
regression_models = []
regression_train_MSE = []
regression_test_MSE = []
The null model is defined as the average rushing yards over the entire data set.
# Null model: predict the training-set mean yardage for every play.
mean_yards = y_train['Yards'].mean()
mean_yards
null_train_predictions = [mean_yards] * len(X_train.index)
null_test_predictions = [mean_yards] * len(X_test.index)
train_MSE = mean_squared_error(y_train, null_train_predictions)
test_MSE = mean_squared_error(y_test, null_test_predictions)
# Record the baseline for the model comparison table.
regression_models.append('Null')
regression_train_MSE.append(train_MSE)
regression_test_MSE.append(test_MSE)
train_MSE
test_MSE
The Null Model gives a baseline test MSE of 38.09.
# Fit a regression tree limited to 10 leaves and render it with graphviz.
regression_tree_clf = DecisionTreeRegressor(max_leaf_nodes=10)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train)
dot_data = tree.export_graphviz(regression_tree_clf, out_file=None,
                                feature_names=X_train.columns,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
The Full Decision tree shows that the three most important predictors for the rushing-yards regression decision tree are field position (YardLine_refactor), defenders in the box and the rusher's yards per game. As the field position goes up (which means that the team is getting closer to their opponent's goal line, i.e. moving in the positive direction up the field), the number of yards a running play is expected to gain goes down. This could be because as they get closer to the goal line there are simply fewer yards left available to gain, and the defenses may change strategy with less room to give. It also shows that the more defenders in the box, the fewer rushing yards can be expected. The 'box' is a rectangle that is the width of the offensive line and about 5 yards deep; defenders in the box are essentially the number of defenders close to the ball at the snap. Yards per game is a measurement of the rusher's total yards gained consistently over every game they run the ball in. This means that more consistent rushers have a higher ypg and, according to this decision tree, are more likely to gain more yards.
# Evaluate the 10-leaf tree on train and test data.
# The original also bound these to null_train_predictions/null_test_predictions,
# which misleadingly clobbered the null model's prediction lists; only the tree
# predictions are needed here.
train_predictions = regression_tree_clf.predict(X_train)
test_predictions = regression_tree_clf.predict(X_test)
train_MSE = mean_squared_error(y_train, train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
regression_models.append('Decision_Tree')
regression_train_MSE.append(train_MSE)
regression_test_MSE.append(test_MSE)
# Comparison table of all models evaluated so far.
regression_models_df = pd.DataFrame({'regression_models': regression_models,
                                     'regression_train_MSE': regression_train_MSE,
                                     'regression_test_MSE': regression_test_MSE})
regression_models_df
First the tree will be pruned by running cross validation on each tree size and determining the best model out of all the created tree sizes.
# Cost-complexity pruning path (candidate alphas and impurities) for reference.
# NOTE(review): the pruning below is actually done by limiting max_leaf_nodes,
# not by ccp_alpha — these values are displayed but otherwise unused.
path = regression_tree_clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas
# Candidate tree sizes (terminal node counts) to cross-validate over.
tree_size = range(2, 11)
def run_cross_validation_on_regression_trees(X, y, tree_size, cv=5, scoring='neg_mean_squared_error'):
    """Cross-validate regression trees over a range of leaf-node counts.

    Parameters
    ----------
    X, y : training features and response.
    tree_size : iterable of int, max_leaf_nodes values to try.
    cv : int, number of cross-validation folds.
    scoring : sklearn scoring string (negated MSE by default).

    Returns
    -------
    tuple of np.ndarray, one entry per tree size:
    (mean CV MSE sign-flipped to positive, CV score std, in-sample training MSE).
    """
    # (Removed the original dead cv_scores_list accumulator; renamed the
    # misleadingly-named accuracy_scores — it holds training MSE, not accuracy.)
    cv_scores_mean = []
    cv_scores_std = []
    train_mse_scores = []
    for size in tree_size:
        tree_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=size)
        cv_scores = cross_val_score(tree_model, X, y, cv=cv, scoring=scoring)
        cv_scores_mean.append(cv_scores.mean())
        cv_scores_std.append(cv_scores.std())
        # In-sample (training) MSE for the same tree size.
        train_mse_scores.append(mean_squared_error(y, tree_model.fit(X, y).predict(X)))
    # neg_mean_squared_error -> positive MSE for plotting.
    cv_scores_mean = -np.array(cv_scores_mean)
    return cv_scores_mean, np.array(cv_scores_std), np.array(train_mse_scores)
def plot_cross_validation_on_regression_trees(tree_size, cv_scores_mean, cv_scores_std, accuracy_scores, title):
    """Plot cross-validated and training MSE against terminal-node count.

    Parameters mirror run_cross_validation_on_regression_trees's outputs:
    tree_size       - iterable of terminal-node counts (x axis)
    cv_scores_mean  - mean cross-validated MSE per tree size
    cv_scores_std   - std of CV scores (unused here; kept for interface
                      compatibility with the runner function)
    accuracy_scores - training-set MSE per tree size
    title           - chart title
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 5))
    # Legend labels fixed: the y axis is MSE, not accuracy.
    ax.plot(tree_size, cv_scores_mean, '-o', label='mean cross-validation MSE', alpha=0.9)
    ax.plot(tree_size, accuracy_scores, '-*', label='train MSE', alpha=0.9)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Terminal Nodes', fontsize=14)
    ax.set_ylabel('MSE', fontsize=14)
    ax.legend()
# Run the CV sweep first, then display and plot the results. (The original
# cell referenced sm_cv_scores_mean on the line *before* it was assigned,
# which raises NameError when the cell is executed top to bottom.)
sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores = run_cross_validation_on_regression_trees(X_train, y_train, tree_size)
sm_cv_scores_mean
plot_cross_validation_on_regression_trees(tree_size, sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores, 'MSE vs. Terminal Nodes on Cross-Validation training data')
From the graph above, it appears that a tree with 5 terminal nodes has the lowest error and is simpler than the full tree determined earlier. This model with 5 Terminal Nodes will be used moving forward for the pruned tree.
# Refit the tree pruned to the 5 terminal nodes chosen by cross-validation.
regression_tree_clf = DecisionTreeRegressor(max_leaf_nodes=5)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train)
dot_data = tree.export_graphviz(regression_tree_clf, out_file=None,
                                feature_names=X_train.columns,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
train_predictions = regression_tree_clf.predict(X_train)
test_predictions = regression_tree_clf.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
# Label fixed to 'Pruned_Tree': the original appended 'Decision_Tree' to the
# lists while the DataFrame row for the same results said 'Pruned_Tree'.
regression_models.append('Pruned_Tree')
regression_train_MSE.append(train_MSE)
regression_test_MSE.append(test_MSE)
# DataFrame.append was removed in pandas 2.0; build a one-row frame and concat.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Pruned_Tree', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
From these two tree models some interesting information about the predictors can be visualized. First, clearly field position (YardLine_refactor) is a key component to determining rushing yards, followed by the number of defenders in the box. These two, along with the runners yards per game and the binary predictor for if the runner is a wide receiver round out the pruned decision tree. These are 4 variables that should be kept track of as further models are analyzed
Now the Tree will be pruned based on cost complexity pruning, using cross validation to obtain an alpha (tuning parameter)value on the training set.
# Refit a 10-leaf tree and compute its cost-complexity pruning path.
regression_tree_clf = DecisionTreeRegressor(max_leaf_nodes=10)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train_regression)
path = regression_tree_clf.cost_complexity_pruning_path(X_train, y_train_regression)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas
# Total leaf impurity as a function of effective alpha (the last value, the
# trivial single-node tree, is dropped from the plot).
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
# 20 evenly spaced alpha candidates over the pruning-path range.
alphas = np.linspace(0,max(ccp_alphas[:-1]),20)
alphas
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for alpha in alphas:
    tree_model = DecisionTreeRegressor(random_state=0, max_leaf_nodes=10,ccp_alpha=alpha)
    cv_scores = cross_val_score(tree_model, X_train, y_train_regression.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    # Negate: scoring returns negative MSE.
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train_regression,tree_model.fit(X_train, y_train_regression.values.ravel()).predict(X_train)))
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
# Title fixed: the chart plots MSE, not accuracy.
ax.set_title('MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter alpha', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
alphas[8]
From the Cross validation on Tuning parameter, a tuning parameter of 0.058 will be used for the Cost Complexity Pruning Regression Tree Model
# Final cost-complexity-pruned tree with the CV-chosen alpha.
regression_tree_clf = DecisionTreeRegressor(random_state=0,ccp_alpha=0.058, max_leaf_nodes=10)
regression_tree_clf = regression_tree_clf.fit(X_train, y_train_regression)
train_predictions = regression_tree_clf.predict(X_train)
test_predictions = regression_tree_clf.predict(X_test)
train_MSE = mean_squared_error(y_train_regression,train_predictions)
test_MSE = mean_squared_error(y_test_regression, test_predictions)
train_MSE
test_MSE
dot_data = tree.export_graphviz(regression_tree_clf, out_file=None,
                                feature_names=X_train.columns,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
backup = copy.deepcopy(regression_models_df)
# The row is given fractional index 2.5 so sort_index slots it between the
# existing rows. DataFrame.append was removed in pandas 2.0; use pd.concat.
data = pd.DataFrame({'regression_models':'Cost_Complexity_Pruned_Tree', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE},index=[2.5])
regression_models_df = pd.concat([regression_models_df, data])
regression_models_df = regression_models_df.sort_index().reset_index(drop=True)
regression_models_df
The Cost complexity Pruning approach ended up obtaining the same results as the full tree for this data set.
# Bagging = a random forest with max_features = p (every feature is
# considered at each split).
clf_bagging = RandomForestRegressor(max_leaf_nodes = 10, oob_score=True,
random_state=0, max_features = len(X_train.columns))
clf_bagging = clf_bagging.fit(X_train,y_train.values.ravel())
# Out-of-bag predictions give an internal estimate of generalization error.
oob_predictions = clf_bagging.oob_prediction_
oob_MSE = mean_squared_error(y_train,oob_predictions)
oob_MSE
train_predictions = clf_bagging.predict(X_train)
test_predictions = clf_bagging.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
Using a Bagging approach gives an OOB error on the training data of 41.04 but a test MSE of 37.28, which is slightly less than the pruned decision tree.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Bagging', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
clf_bagging.feature_importances_
# Horizontal bar chart of feature importances, least to most important.
tree_importance_sorted_idx = np.argsort(clf_bagging.feature_importances_)
tree_indices = np.arange(0, len(clf_bagging.feature_importances_)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_bagging.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Set tick positions BEFORE labels, otherwise matplotlib applies the labels
# to its auto-chosen ticks instead of the bar positions.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(clf_bagging.feature_importances_)))
ax1.set_title('Bagging Variable Importance')
The Bagging predictor importance chart shows similar predictor information to the previously calculated trees. Field Position (YardLine_refactor), rushers yards per game, defenders in the box, and if the rusher is a wide receiver are the most important predictors according to this bagging model.
# Number of available predictors, and the common sqrt(p) heuristic for m.
len(X_train.columns)
np.sqrt(len(X_train.columns))

# Sweep max_features from 1 to p, tracking OOB MSE vs. training MSE.
num_predictors = range(1, len(X_train.columns) + 1)
oob_MSE_list = []
train_MSE_list = []
for num in num_predictors:
    clf_rf = RandomForestRegressor(max_leaf_nodes=10, oob_score=True,
                                   random_state=0, max_features=num)
    clf_rf = clf_rf.fit(X_train, y_train.values.ravel())
    oob_predictions = clf_rf.oob_prediction_
    oob_MSE = mean_squared_error(y_train, oob_predictions)
    oob_MSE_list.append(oob_MSE)
    train_predictions = clf_rf.predict(X_train)
    train_MSE = mean_squared_error(y_train, train_predictions)
    train_MSE_list.append(train_MSE)

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(num_predictors, oob_MSE_list, '-o', label='OOB MSE', alpha=0.9)
ax.plot(num_predictors, train_MSE_list, '-*', label='train MSE', alpha=0.9)
ax.set_title('Random Forest: MSE vs. Number of Predictors', fontsize=16)
ax.set_xlabel('Number of Predictors', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
From the plot above of MSE vs. Number of Predictors, the OOB MSE appears to level off around 18 predictors.
# Sweep the number of trees (1, then 25..500 in steps of 25) at m = 18.
number_trees = np.arange(25, 525, 25)
number_trees = np.insert(number_trees, 0, 1)
oob_MSE_list = []
train_MSE_list = []
for num in number_trees:
    clf_rf = RandomForestRegressor(n_estimators=num, max_leaf_nodes=10, oob_score=True,
                                   random_state=0, max_features=18)
    clf_rf = clf_rf.fit(X_train, y_train.values.ravel())
    oob_predictions = clf_rf.oob_prediction_
    oob_MSE = mean_squared_error(y_train, oob_predictions)
    oob_MSE_list.append(oob_MSE)
    train_predictions = clf_rf.predict(X_train)
    train_MSE = mean_squared_error(y_train, train_predictions)
    train_MSE_list.append(train_MSE)
    print(num)

# Full view, including the single-tree point.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(number_trees, oob_MSE_list, '-o', label='m=18 OOB MSE', alpha=0.9)
ax.plot(number_trees, train_MSE_list, '-*', label='m=18 train MSE', alpha=0.9)
ax.set_title('Random Forest: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()

# Same chart without the n=1 outlier for readability.
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(number_trees[1:], oob_MSE_list[1:], '-o', label='m=18 OOB MSE', alpha=0.9)
ax.plot(number_trees[1:], train_MSE_list[1:], '-*', label='m=18 train MSE', alpha=0.9)
ax.set_title('Random Forest: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
From this plot it appears that the MSE levels off around 200 Trees. A Random Forest with 18 predictors and 200 Trees will be used as the final Random Forest Model.
# Final random forest: 200 trees, 18 features considered at each split.
clf_rf = RandomForestRegressor(n_estimators=200, max_leaf_nodes = 10, oob_score=True,
random_state=0, max_features = 18)
clf_rf = clf_rf.fit(X_train,y_train.values.ravel())
# OOB predictions give an internal estimate of generalization error.
oob_predictions = clf_rf.oob_prediction_
oob_MSE = mean_squared_error(y_train,oob_predictions)
oob_MSE
train_predictions = clf_rf.predict(X_train)
test_predictions = clf_rf.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
Using a Random Forest approach with 18 predictors and 200 Trees gives an OOB error on the training data of 41.64 but a test MSE of 37.17, which is slightly less than the Bagging Model MSE.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Random Forest', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
tree_importance_sorted_idx = np.argsort(clf_rf.feature_importances_)
tree_indices = np.arange(0, len(clf_rf.feature_importances_)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_rf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Set tick positions BEFORE labels so each label lands on its tick.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
# Bug fix: the y-limit previously used clf_bagging's importances (copy-paste
# from the bagging cell); this chart is for the random forest model.
ax1.set_ylim((0, len(clf_rf.feature_importances_)))
ax1.set_title('Random Forest Variable Importance')
Similarly to the Bagging variable importance, field position, rusher yards per game and defenders in the box are the most important variables for predicting rushing yards on a given play. All of these top 3 variables do appear to be slightly less important in the Random Forest model compared to the Bagging model, though.
# Determine number of trees using cross validation.
# Dropped three dead reassignments of number_trees and a stray reference to
# the undefined name `clf_boosting3`, which raised NameError when the cell ran.
number_trees = np.arange(50,250,50)
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
for size in number_trees:
    clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4,
                                             random_state=0,
                                             n_estimators=size)
    cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    # Negate: scoring returns negative MSE.
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    print(size)
cv_scores_mean
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(number_trees, cv_scores_mean, '-*', label='Boosting CV MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Even at a relatively low number of trees (from 50 to 200) the CV MSE is increasing for Boosting. With this information we will look to tune to lower numbers of trees.
# Second sweep over LOWER tree counts. The combined plot below stitches
# np.arange(10,50,5) (this cell) onto np.arange(50,250,50) (previous cell),
# but the original loop reused the 50-250 range, leaving combined_num_trees
# (12 values) and combined_cv_scores (8 values) with mismatched lengths.
number_trees = np.arange(10,50,5)
cv_scores_list2 = []
cv_scores_std2 = []
cv_scores_mean2 = []
train_MSE_list = []
for size in number_trees:
    clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4,
                                             random_state=0,
                                             n_estimators=size)
    cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list2.append(cv_scores)
    cv_scores_mean2.append(-cv_scores.mean())
    cv_scores_std2.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_boosting.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(size)
# Stitch the low-range (this cell) and high-range (previous cell) sweeps.
combined_num_trees = list(np.arange(10,50,5)) + list(np.arange(50,250,50))
combined_cv_scores = list(cv_scores_mean2) + list(cv_scores_mean)
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(combined_num_trees, combined_cv_scores, '-*', label='Boosting CV MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
# CV vs. training MSE for the low range only.
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(number_trees, cv_scores_mean2, '-o', label='CV MSE', alpha=0.9)
ax.plot(number_trees, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
Looking at this plot of MSE vs. Number of Trees for Boosting (with a shrinkage parameter of 0.1), it appears 25 Trees is a good amount for boosting based on the cross validation MSE. It's interesting to note that the over-fitting due to a higher number of trees is prevalent, shown by the decreasing Train MSE with increasing CV MSE, which isn't the case for bagging and random forests.
# Candidate shrinkage (learning-rate) values: 20 steps from 0.001 toward 0.1,
# plus 0.1 itself.
lamda = np.arange(0.001,0.1,(0.1-0.001)/20).tolist()
lamda.append(0.1)
cv_scores_list_lamda = []
cv_scores_std_lamda = []
cv_scores_mean_lamda = []
train_MSE_list = []
for learning_rate in lamda:
    clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4,
                                             random_state=0,
                                             n_estimators=25, learning_rate=learning_rate)
    cv_scores = cross_val_score(clf_boosting, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list_lamda.append(cv_scores)
    cv_scores_mean_lamda.append(-cv_scores.mean())
    cv_scores_std_lamda.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_boosting.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(learning_rate)
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(lamda, cv_scores_mean_lamda, '-o', label='CV MSE', alpha=0.9)
ax.plot(lamda, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Boosting: MSE vs. Shrinkage Parameter Lambda', fontsize=16)
# x-axis label fixed: the x axis is the shrinkage parameter, not tree count.
ax.set_xlabel('Shrinkage Parameter Lambda', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
This plot of MSE vs. shrinkage parameter shows that a value of lambda=0.07 is where CV MSE is minimized for the Boosting model. A Boosting Model with 25 trees and a shrinkage parameter of 0.07 will be used as the final Boosting Model.
# Final boosting model: 25 trees, shrinkage 0.07 (chosen by the CV sweeps above).
clf_boosting = GradientBoostingRegressor(max_leaf_nodes = 10, max_depth=4,
random_state=0,
n_estimators=25, learning_rate=0.07)
clf_boosting = clf_boosting.fit(X_train,y_train.values.ravel())
train_predictions = clf_boosting.predict(X_train)
test_predictions = clf_boosting.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
Using a Boosting approach with 25 trees and a shrinkage parameter of 0.07 gives a training MSE of 41.1 and a test MSE of 37.09. This value is slightly lower than the values found for Bagging and Random Forest.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Boosting', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
# Variable-importance chart for the boosting model.
tree_importance_sorted_idx = np.argsort(clf_boosting.feature_importances_)
tree_indices = np.arange(0, len(clf_boosting.feature_importances_)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_boosting.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Set tick positions BEFORE labels so each label lands on its tick.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(clf_boosting.feature_importances_)))
The variable importance chart for Boosting shows more variables are insignificant to the response. Field postition, rushers yards per game and defenders in the box have the most importance to the response.
# Baseline multiple linear regression on all predictors.
clf_linear = LinearRegression()
clf_linear = clf_linear.fit(X_train,y_train)
train_predictions = clf_linear.predict(X_train)
test_predictions = clf_linear.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Multi_Linear_Regression', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
def fit_linear_reg(X,Y):
    """Fit an OLS model on (X, Y) and return (RSS, R_squared) on that data."""
    estimator = LinearRegression(fit_intercept = True).fit(X, Y)
    fitted = estimator.predict(X)
    # RSS = mean squared error times n (sum of squared residuals).
    RSS = mean_squared_error(Y, fitted) * len(Y)
    R_squared = estimator.score(X, Y)
    return RSS, R_squared
# Forward stepwise selection: at each step, greedily add the single feature
# whose inclusion yields the lowest RSS, for model sizes 1..30.
# NOTE(review): relies on `itertools`, which is not among the file's visible
# imports - confirm it is imported elsewhere in the notebook.
remaining_features = list(X_train.columns.values)
features = []
RSS_list, R_squared_list = [np.inf], [np.inf]  # index-0 placeholders: loop is 1-indexed
features_list = dict()
k = 30
for i in range(1, k + 1):
    best_RSS = np.inf
    for combo in itertools.combinations(remaining_features, 1):
        RSS = fit_linear_reg(X_train[list(combo) + features], y_train)
        if RSS[0] < best_RSS:
            best_RSS = RSS[0]
            best_R_squared = RSS[1]
            best_feature = combo[0]
    # Commit the winning feature before the next round.
    features.append(best_feature)
    remaining_features.remove(best_feature)
    RSS_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    features_list[i] = features.copy()
df1 = pd.concat([pd.DataFrame({'features':features_list}),pd.DataFrame({'RSS':RSS_list, 'R_squared': R_squared_list})], axis=1, join='inner')
df1['numb_features'] = df1.index
# Compute adjusted R_squared
m = len(y_train)
p = 31
hat_sigma_squared = (1/(m - p -1)) * min(df1['RSS'])
df1['R_squared_adj'] = 1 - ( (1 - df1['R_squared'])*(m-1)/(m-df1['numb_features'] -1))
df1.sort_values('R_squared_adj',ascending=False)
From the forward stepwise selection, 17 variables were selected based on the adjusted R^2 value for that model.
# The 17-feature model maximized adjusted R^2 in the forward stepwise search.
best_subset = df1.loc[17]['features']
best_subset
clf_linear = LinearRegression()
clf_linear = clf_linear.fit(X_train[best_subset],y_train)
train_predictions = clf_linear.predict(X_train[best_subset])
test_predictions = clf_linear.predict(X_test[best_subset])
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
The forward subset selection gave a training MSE of 41.7 and a test MSE of 37.2. Surprisingly, this was a slightly higher test MSE than the test MSE for the Linear Regression with all predictors. This could be due to the training and test split or the fact that the forward stepwise subset selection was determined based on adjusted r^2.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Forward_Stepwise_Subset', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
# Backward stepwise selection: starting from all predictors, drop one feature
# per round, keeping the size-i subset with the lowest RSS.
remaining_features = list(X_train.columns.values)
features = []
RSS_list, R_squared_list = [], []
features_list = dict()
num_features = []
for j in range(1, k + 1):
    i = 31 - j  # subset size shrinks from 30 down to 1 (k = 30 from above)
    print(i)
    best_RSS = np.inf
    for combo in itertools.combinations(remaining_features, i):
        RSS = fit_linear_reg(X_train[list(combo)], y_train)
        if RSS[0] < best_RSS:
            best_RSS = RSS[0]
            best_R_squared = RSS[1]
            best_combo = combo
    # Only features that survived this round are candidates next round.
    remaining_features = best_combo
    RSS_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    features_list[j-1] = best_combo
    num_features.append(i)
df_back = pd.concat([pd.DataFrame({'features':features_list}),pd.DataFrame({'numb_features':num_features, 'RSS':RSS_list, 'R_squared': R_squared_list})], axis=1, join='inner')
df_back
# Adjusted R^2 for each subset size.
m = len(y_train)
p = 31
hat_sigma_squared = (1/(m - p -1)) * min(df_back['RSS'])
df_back['R_squared_adj'] = 1 - ( (1 - df_back['R_squared'])*(m-1)/(m-df_back['numb_features'] -1))
df_back.sort_values('R_squared_adj',ascending=False)
From the backward stepwise selection, 17 variables were selected based on the adjusted R^2 value for that model.
# Fit a fresh linear model on the backward-selection subset.
best_subset = np.array(df_back.loc[13]['features'])
clf_backward = LinearRegression()
# Bug fix: the original called clf_linear.fit(...) here, silently refitting
# (and clobbering) the earlier full/forward-model estimator instead of this one.
clf_backward = clf_backward.fit(X_train[best_subset],y_train)
train_predictions = clf_backward.predict(X_train[best_subset])
test_predictions = clf_backward.predict(X_test[best_subset])
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
The backward subset selection gave similar results to the forward subset selection.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Backward_Stepwise_Subset', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
# RidgeCV picks the tuning parameter by 5-fold CV over a log-spaced grid.
reg = RidgeCV(alphas=np.logspace(-6, 6, 13),
              cv=5)
ridge_model = reg.fit(X_train,y_train)
ridge_model.alpha_
lamda_cv = ridge_model.alpha_
lamda_cv
# Manual sweep over the same grid to plot CV MSE vs. training MSE.
alphas=np.logspace(-6, 6, 13)
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for alpha in alphas:
    clf_ridge = Ridge(alpha=alpha)
    cv_scores = cross_val_score(clf_ridge, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_MSE_list.append(mean_squared_error(y_train, clf_ridge.fit(X_train, y_train.values.ravel()).predict(X_train)))
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Ridge Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
# Zoomed view: drop the three largest alphas.
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas[:-3], cv_scores_mean[:-3], '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas[:-3], train_MSE_list[:-3], '-*', label='Train MSE', alpha=0.9)
ax.set_title('Ridge Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
cv_scores_mean
From cross validation, the tuning parameter is chosen to be 100.
# Final ridge model with the CV-chosen tuning parameter (alpha = 100).
clf_ridge = Ridge(alpha=100)
clf_ridge = clf_ridge.fit(X_train,y_train)
train_predictions = clf_ridge.predict(X_train)
test_predictions = clf_ridge.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
For a ridge regression model with a tuning parameter of 100, the training MSE was calculated to be 41.73, with a cross validation (on training data) of 41.87. The test MSE was calculated to be 37.177, which is less than all of the other linear regression models thus far.
# Bar chart of ridge coefficients, sorted by value.
# NOTE(review): coef_ is indexed with [0], implying a 2-D coefficient array
# (ridge fit on a 2-D y) - confirm y_train's shape.
importance_sorted_idx = np.argsort(clf_ridge.coef_[0])
tree_indices = np.arange(0, len(clf_ridge.coef_[0])) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_ridge.coef_[0][importance_sorted_idx], height=0.7)
# Set tick positions BEFORE labels so each label lands on its tick.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
From this plot you can see certain parameters coefficients are much lower than others, as the Ridge Regression tuning parameter is pushing them to zero. The highest coefficient is if the runner is a wide receiver and then rushers yards per game. The largest negative correlations are field position and defenders in the box.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Ridge_Regression', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
# LassoCV picks the tuning parameter by 5-fold CV over the same log grid.
reg = LassoCV(alphas=np.logspace(-6, 6, 13),
              cv=5)
lasso_model = reg.fit(X_train,y_train.values.ravel())
lasso_model.alpha_
# Manual sweep (reuses `alphas` from the ridge cell) to plot CV vs. train MSE.
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for alpha in alphas:
    clf_lasso = Lasso(alpha=alpha)
    cv_scores = cross_val_score(clf_lasso, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    # Bug fix: the original refit clf_ridge here (copy-paste from the ridge
    # cell), so the plotted "Train MSE" curve was for ridge, not lasso.
    train_MSE_list.append(mean_squared_error(y_train, clf_lasso.fit(X_train, y_train.values.ravel()).predict(X_train)))
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas, train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Lasso Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
# Zoomed view: drop the three largest alphas.
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(alphas[:-3], cv_scores_mean[:-3], '-o', label='CV MSE', alpha=0.9)
ax.plot(alphas[:-3], train_MSE_list[:-3], '-*', label='Train MSE', alpha=0.9)
ax.set_title('Lasso Regression: MSE vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.set_xscale('log')
ax.legend()
From cross-validation, the tuning parameter is selected to be 0.01.
# Final lasso model with the CV-chosen tuning parameter (alpha = 0.01).
clf_lasso = Lasso(alpha=0.01)
clf_lasso = clf_lasso.fit(X_train,y_train)
# Minimum CV MSE from the sweep above.
min(cv_scores_mean)
train_predictions = clf_lasso.predict(X_train)
test_predictions = clf_lasso.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
For a lasso regression model with a tuning parameter of 0.01, the training MSE was calculated to be 41.74, with a cross validation (on training data) of 41.86. The test MSE was calculated to be 37.167, which is less than all of the other linear regression models thus far, including Ridge Regression.
# Bar chart of lasso coefficients, sorted by value (coef_ is 1-D here).
importance_sorted_idx = np.argsort(clf_lasso.coef_)
tree_indices = np.arange(0, len(clf_lasso.coef_)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_lasso.coef_[importance_sorted_idx], height=0.7)
# Set tick positions BEFORE labels so each label lands on its tick.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_ylim((0, len(clf_lasso.coef_)))
From this plot you can see certain parameters coefficients are pushed to zero by the lasso regression, including Wildcat, OL, defense score, shotgun, down and others. The highest coefficient is if the runner is a wide receiver and then rushers yards per game. The largest negative correlations are field position and defenders in the box.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Lasso_Regression', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
pls = PLSRegression(n_components=10)
r2_values = []
regr = LinearRegression()
n = len(X_train)
# Null model (just intercept, no principal components in regression)
score = cross_val_score(regr, np.ones((n,1)), y_train, cv=5,scoring='r2').mean()
r2_values.append(score)
# Sweep the number of PLS components from 1 to 30.
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_MSE_list = []
for i in np.arange(1,31):
    clf_pls = PLSRegression(n_components=i)
    cv_scores = cross_val_score(clf_pls, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(-cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    # Bug fix: the original refit clf_ridge here (copy-paste), so the plotted
    # "Train MSE" curve was for the ridge model, not PLS.
    train_MSE_list.append(mean_squared_error(y_train, clf_pls.fit(X_train, y_train.values.ravel()).predict(X_train)))
fig, ax = plt.subplots(1,1, figsize=(15,5))
ax.plot(np.arange(1,31), cv_scores_mean, '-o', label='CV MSE', alpha=0.9)
ax.plot(np.arange(1,31), train_MSE_list, '-*', label='Train MSE', alpha=0.9)
ax.set_title('Partial Least Squares: MSE vs. Principal Components', fontsize=16)
ax.set_xlabel('Number of Principal Components', fontsize=14)
ax.set_ylabel('MSE', fontsize=14)
ax.legend()
cv_scores_mean[3]
From Cross-Validation, It appears 4 Principal Components is sufficient for the Partial Least Squares Analysis.
# Final PLS model with the CV-chosen number of components (4).
clf_pls = PLSRegression(n_components=4)
clf_pls = clf_pls.fit(X_train,y_train)
train_predictions = clf_pls.predict(X_train)
test_predictions = clf_pls.predict(X_test)
train_MSE = mean_squared_error(y_train,train_predictions)
test_MSE = mean_squared_error(y_test, test_predictions)
train_MSE
test_MSE
Using a Partial Least Squares Model with 4 components, the cross-validation (on the training data) MSE was calculated to be 41.9 and the full training MSE was 41.7. The test set MSE was 37.2, which is higher than all the other regression models thus far.
backup = copy.deepcopy(regression_models_df)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
regression_models_df = pd.concat([regression_models_df, pd.DataFrame([{'regression_models':'Partial_Least_Squares', 'regression_train_MSE':train_MSE, 'regression_test_MSE':test_MSE}])], ignore_index=True)
regression_models_df
We have now analyzed 11 different regressions (in addition to the null model) for determining rushing yards on a given NFL play based on 30 predictors. To compare all of these models against one another, the data was split into training and test data. Then each model was created from the training data, with the final model comparisons to be done using the test set Mean Squared Error. Clearly this data and regression is highly complex and tough to represent using linear models. This makes sense based on the diagnostic plots generated. None of the models particularly stand out from the null model, which makes sense based on the complexity and nature of the problem. However, certiain models did achieve better results than others, particularly Boosting. Even though it is a tree-based model, boosting showed the lowest test MSE compared to all of the other models, including ridge, lasso and Partial Least Squares among others. All of the models are better than the null model, which shows that these variables may be a good starting point for future more complex analysis.
In this regression analysis we attempted to determine the number of yards a running play would generate based on numerous predictors, including the yards per game of the rusher, the position of the rusher, the offensive and defensive formations and where the ball is located on the field, among others. First, data was processed to get all of this information from a data set provided with information on every running play from 2017-2019. Collinearity was investigated, in which it was determined that several predictors should be taken out.
Not surprisingly after the diagnostics earlier, the models struggled to correctly determine the yards an NFL rushing play would generate based on 30 predictor variables. The data was right-skewed due to the seldom long runs that happen. Perhaps above all else this was a good learning experience in the importance of the diagnostics and assumptions of linear models. However, it did show certain interesting information, such as the runner being a wide receiver, distance to go (from the first down marker) and being the home team being positively correlated with the rushing yards gained on a play. Also, certain predictors show a negative correlation to rushing yards on a play, including field position, number of defenders in the box, number of defensive linemen and linebackers, if the quarterback is the rusher and, most surprisingly, if the game is played at a retractable-roof stadium with the roof open. Other small effects were that the field being outdoors and the play going to the left show a slight positive correlation to rushing yards.
Though the accuracy of these models may not be at the point where coaches and owners (or fantasy sports fans and gamblers) would trust the model to determine with great accuracy the amount of yards a rushing play would generate, it does provide certain insight into the factors that may lead to a rushing play gaining more yards.
In this classification problem, I will use the same data set as above to determine whether a rushing play will gain a first down or not based on the predictor variables.
# Preview the play-level data and derive the binary classification response:
# a rushing play converts a first down when the yards gained meet or exceed
# the distance to go.
runners.head()
# Vectorized comparison instead of a row-wise apply/lambda: produces the
# identical 0/1 column, but in a single C-level pass over the frame.
runners['FirstDown'] = (runners['Yards'] >= runners['Distance']).astype(int)
runners['FirstDown'].value_counts()
The same predictors will be used in the classification model as in the regression model above.
# Build the classification target aligned with the regression train/test
# split already made above: select FirstDown at the row positions that
# X_train / X_test were drawn from.
y_train = runners[['FirstDown']].iloc[X_train.index]
y_test = runners[['FirstDown']].iloc[X_test.index]
y_train
# Running tallies for the model-comparison table assembled below.
classification_models, classification_train_ER, classification_test_ER = [], [], []
The null model for classification is defined as the majority class, i.e. the most frequent FirstDown value in the training set, predicted for every observation.
# Majority-class (null) baseline: predict the most frequent training
# FirstDown value for every train and test observation.
y_train.FirstDown.mode()[0]
majority_class = y_train['FirstDown'].mode()[0]
null_train_predictions = [majority_class] * len(X_train.index)
null_test_predictions = [majority_class] * len(X_test.index)
train_ER = 1 - accuracy_score(y_train, null_train_predictions)
test_ER = 1 - accuracy_score(y_test, null_test_predictions)
classification_models.append('Null')
classification_train_ER.append(train_ER)
classification_test_ER.append(test_ER)
train_ER
test_ER
print(classification_report(y_test, null_test_predictions))
# Seed the comparison table with the null-model error rates.
classification_models_df = pd.DataFrame({
    'classification_models': classification_models,
    'classification_train_ER': classification_train_ER,
    'classification_test_ER': classification_test_ER,
})
classification_models_df
# Full decision tree, capped at 10 terminal nodes.
classification_tree_clf = DecisionTreeClassifier(max_leaf_nodes=10)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
train_predictions = classification_tree_clf.predict(X_train)
test_predictions = classification_tree_clf.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the identical row-appended frame.
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Decision_Tree',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The Training set classification error rate is calculated to be 0.175 while the test error rate is calculated to be 0.178, these are both improvements over the null model.
# Render the fitted tree with graphviz for visual inspection.
dot_data = tree.export_graphviz(
    classification_tree_clf,
    out_file=None,
    feature_names=X_train.columns,
    class_names=['No', 'Yes'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph
From the full decision tree above, a few important predictors can be noted. The first and second decisions are based on Distance, which makes sense because Distance is the variable for distance to first down. So fewer yards to the first down means a higher probability of converting a first down. Other important variables are Defenders in the box and field position (YardLine_refactor).
# Cross-validate the number of terminal nodes (2..10) with 5-fold CV,
# tracking the CV error rate and training error rate per size.
tree_size = range(2, 11)
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for size in tree_size:
    tree_model = DecisionTreeClassifier(random_state=0, max_leaf_nodes=size)
    cv_scores = cross_val_score(tree_model, X_train, y_train.values.ravel(),
                                cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1 - cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    # Refit on the full training set for the train error curve.
    fitted = tree_model.fit(X_train, y_train.values.ravel())
    train_ER_list.append(1 - accuracy_score(y_train, fitted.predict(X_train)))
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(tree_size, cv_scores_mean, '-o', label='CV Error Rate', alpha=0.9)
ax.plot(tree_size, train_ER_list, '-*', label='Train Error Rate', alpha=0.9)
ax.set_title('Accuracy vs. Terminal Nodes', fontsize=16)
ax.set_xlabel('Terminal Nodes', fontsize=14)
ax.set_ylabel('Classification Rate', fontsize=14)
ax.legend()
From Cross Validation, the number of terminal nodes is selected to be 8.
# Refit the tree with the CV-selected 8 terminal nodes.
classification_tree_clf = DecisionTreeClassifier(max_leaf_nodes=8)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
train_predictions = classification_tree_clf.predict(X_train)
test_predictions = classification_tree_clf.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Pruned_Tree',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The Training set classification error rate is calculated to be 0.175 while the test error rate is calculated to be 0.178, these are both exactly the same as the full decision tree. This is due to the fact that even though the cross validation error rate was less for 8 terminal nodes, when rebuilding on the full training set, it is essentially the exact same tree as the original full 10 terminal node decision tree. The gini coefficients and decisions are slightly different but the terminal nodes still contain essentially the same data. Notice that the last splits on each branch all still hold the same classification on each node, so there is no change in the actual values obtained from evaluating the separate models. See below:
# Visualize the 8-leaf pruned tree.
dot_data = tree.export_graphviz(
    classification_tree_clf,
    out_file=None,
    feature_names=X_train.columns,
    class_names=['No', 'Yes'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph
# Cost-complexity pruning path of the 10-leaf tree: the effective alphas
# and the total leaf impurity at each pruning step.
classification_tree_clf = DecisionTreeClassifier(max_leaf_nodes=10)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
path = classification_tree_clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
ccp_alphas
fig, ax = plt.subplots()
# The last alpha prunes to the bare root, so it is excluded from the plot.
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
# Candidate tuning parameters: 20 evenly spaced values up to the largest
# non-trivial alpha.
alphas = np.linspace(0, max(ccp_alphas[:-1]), 20)
alphas
# 5-fold CV over the candidate cost-complexity alphas.
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for alpha in alphas:
    tree_model = DecisionTreeClassifier(random_state=0, max_leaf_nodes=10,
                                        ccp_alpha=alpha)
    cv_scores = cross_val_score(tree_model, X_train, y_train.values.ravel(),
                                cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1 - cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    refit = tree_model.fit(X_train, y_train.values.ravel())
    train_ER_list.append(1 - accuracy_score(y_train, refit.predict(X_train)))
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV Error Rate', alpha=0.9)
ax.plot(alphas, train_ER_list, '-*', label='Train Error Rate', alpha=0.9)
ax.set_title('Accuracy vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Tuning Parameter alpha', fontsize=14)
ax.set_ylabel('Classification Rate', fontsize=14)
ax.legend()
# The selected candidate (reported in the text below).
alphas[3]
From the Cross validation on Tuning parameter, a tuning parameter of 0.00066 will be used for the Cost Complexity Pruning Classification Model.
# Refit with the CV-selected cost-complexity penalty.
classification_tree_clf = DecisionTreeClassifier(ccp_alpha=0.00066)
classification_tree_clf = classification_tree_clf.fit(X_train, y_train)
train_predictions = classification_tree_clf.predict(X_train)
test_predictions = classification_tree_clf.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Cost_Complexity_Pruned_Tree',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The Training set classification error rate is calculated to be 0.175 while the test error rate is calculated to be 0.178, these are both exactly the same as the full decision tree (and the previously pruned tree on number of terminal nodes). This is due to the fact that even though the cross validation error rate was less for a tuning parameter of 0.00066, when rebuilding on the full training set, it is essentially the exact same tree as the original full 10 terminal node decision tree. The gini coefficients and decisions are slightly different but the terminal nodes still contain essentially the same data. See below:
# Visualize the cost-complexity pruned tree.
dot_data = tree.export_graphviz(
    classification_tree_clf,
    out_file=None,
    feature_names=X_train.columns,
    class_names=['No', 'Yes'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph
# Bagging = a random forest whose max_features equals ALL predictors, so
# every tree considers the full feature set at each split.
clf_bagging = RandomForestClassifier(max_leaf_nodes=10, oob_score=True,
                                     random_state=0,
                                     max_features=len(X_train.columns))
clf_bagging = clf_bagging.fit(X_train, y_train.values.ravel())
oob_ER = 1 - clf_bagging.oob_score_
oob_ER
train_predictions = clf_bagging.predict(X_train)
test_predictions = clf_bagging.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Bagging',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
A bagging approach on the classification tree led to an out of bag error of 0.172 and a training error rate of 0.171. When this model was applied to the test set an error rate of 0.174 was obtained, which is smaller than the full decision tree and pruned trees.
# Impurity-based variable-importance plot for the bagged model.
clf_bagging.feature_importances_
tree_importance_sorted_idx = np.argsort(clf_bagging.feature_importances_)
tree_indices = np.arange(0, len(clf_bagging.feature_importances_)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_bagging.feature_importances_[tree_importance_sorted_idx],
         height=0.7)
# FIX: set tick positions before labels; labelling before positioning can
# leave labels misaligned (and warns) in recent matplotlib versions.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(clf_bagging.feature_importances_)))
ax1.set_title('Bagging Variable Importance')
The variable importance on the bagging model shows a very interesting result. Distance to first down is by far the most important predictor to determining if the rush will obtain a first down. Other slightly important variables are the field position (YardLine_refactor), if the runner is a wide receiver and the amount of defenders in the box.
# Tune max_features for the random forest: sweep 1..p predictors and
# record out-of-bag vs. training error (sqrt(p) shown for reference).
len(X_train.columns)
np.sqrt(len(X_train.columns))
num_predictors = range(1, len(X_train.columns) + 1)
oob_ER_list = []
train_ER_list = []
for num in num_predictors:
    clf_rf = RandomForestClassifier(max_leaf_nodes=10, oob_score=True,
                                    random_state=0, max_features=num)
    clf_rf = clf_rf.fit(X_train, y_train.values.ravel())
    oob_ER = 1 - clf_rf.oob_score_
    oob_ER_list.append(oob_ER)
    train_predictions = clf_rf.predict(X_train)
    train_ER = 1 - accuracy_score(y_train, train_predictions)
    train_ER_list.append(train_ER)
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(num_predictors, oob_ER_list, '-o', label='OOB ', alpha=0.9)
ax.plot(num_predictors, train_ER_list, '-*', label='train', alpha=0.9)
ax.set_title('Random Forest: Classification Error Rate vs. Number of Predictors', fontsize=16)
ax.set_xlabel('Number of Predictors', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.legend()
From this plot it appears that the out of bag classification error rate reaches a minimum at 15 predictors.
# Tune the number of trees at the chosen max_features=15: sweep forests of
# 25..500 trees, tracking out-of-bag and training error rates.
number_trees = np.arange(25, 525, 25)
oob_ER_list = []
train_ER_list = []
for num in number_trees:
    clf_rf = RandomForestClassifier(n_estimators=num, max_leaf_nodes=10,
                                    oob_score=True, random_state=0,
                                    max_features=15)
    clf_rf = clf_rf.fit(X_train, y_train.values.ravel())
    oob_ER = 1 - clf_rf.oob_score_
    oob_ER_list.append(oob_ER)
    train_predictions = clf_rf.predict(X_train)
    train_ER = 1 - accuracy_score(y_train, train_predictions)
    train_ER_list.append(train_ER)
    print(num)  # progress indicator -- fitting 20 forests is slow
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(number_trees, oob_ER_list, '-o', label='OOB ', alpha=0.9)
ax.plot(number_trees, train_ER_list, '-*', label='train', alpha=0.9)
ax.set_title('Random Forest: Classification Error Rate vs. Number of Trees', fontsize=16)
ax.set_xlabel('Number of Trees', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.legend()
From this plot it appears that the out of bag Classification Error Rate reaches a minimum around 100 Trees. A Random Forest with 15 predictors and 100 Trees will be used as the final Random Forest Model.
# Final random forest: 100 trees, 15 features per split (both CV-selected).
clf_rf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=10,
                                oob_score=True, random_state=0,
                                max_features=15)
clf_rf = clf_rf.fit(X_train, y_train.values.ravel())
oob_ER = 1 - clf_rf.oob_score_
oob_ER
train_predictions = clf_rf.predict(X_train)
test_predictions = clf_rf.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Random Forest',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The random forest model with 15 predictors and 100 trees had an out of bag classification error rate of 0.172 and a training set error rate of 0.171, slightly smaller than the Bagging Training set classification error rate. However, the test set classification error rate is 0.1748, slightly larger than the test set bagging classification error rate. This shows that there is a slightly higher variance in the random forest approach here than in the bagging approach.
# Impurity-based variable-importance plot for the tuned random forest.
clf_rf.feature_importances_
tree_importance_sorted_idx = np.argsort(clf_rf.feature_importances_)
tree_indices = np.arange(0, len(clf_rf.feature_importances_)) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_rf.feature_importances_[tree_importance_sorted_idx],
         height=0.7)
# FIX: set tick positions before labels; labelling before positioning can
# leave labels misaligned (and warns) in recent matplotlib versions.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X_train.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(clf_rf.feature_importances_)))
ax1.set_title('Random Forest Classifier Variable Importance')
Similar to the bagging classifier, the distance to first down predictor is by far the most important variable for determining if a rush will make a first down. However, unlike in the bagging classifier, down number is the second most important variable for determining if a run will make a first down. Other variables that show slight importance are field position, the binary value for if the rusher is a WR, the number of defenders in the box, the number of wide receivers and the rusher's yards per game.
Note that this approach is not advisable for classification. This is just a study to see the effects.
# Linear regression used as a classifier (for study only, as noted above):
# fit on the 0/1 response, then threshold the continuous fitted values at 0.5.
clf_linear = LinearRegression()
clf_linear = clf_linear.fit(X_train, y_train)
train_probabilities = clf_linear.predict(X_train)
test_probabilities = clf_linear.predict(X_test)
train_predictions = np.where(train_probabilities > 0.5, 1, 0)
test_predictions = np.where(test_probabilities > 0.5, 1, 0)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
# Training-set confusion matrix, labelled for readability.
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Linear_Regression',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The linear regression approach (typically not advisable for classification) has a training set classification error rate of 0.178 and a test set classification error rate of 0.177. This test set classification error rate is higher than the error rates for bagging and random forest.
# Coefficient plot for the linear-regression classifier, sorted by value.
clf_linear.coef_
importance_sorted_idx = np.argsort(clf_linear.coef_[0])
tree_indices = np.arange(0, len(clf_linear.coef_[0])) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_linear.coef_[0][importance_sorted_idx], height=0.7)
# FIX: set tick positions before labels so they stay aligned.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
# BUG FIX: was len(clf_ridge.coef_[0]) -- clf_ridge is not defined until
# the ridge section further below, so a clean top-to-bottom run raised
# NameError here.
ax1.set_ylim((0, len(clf_linear.coef_[0])))
From the coefficients for each of the predictors in the linear regression model, we find the following information for predicting whether a rush will reach a first down. If the rusher is a wide receiver or fullback the probability of the play gaining a first down is more likely. Also if the play comes from the Wildcat Formation it has a correlation with running plays getting a first down. Finally, as the down increases, the more likely it is that the rush will go for a first down. However, Distance to go for first down is negatively correlated with whether the run will make a first down. Other negative correlations to a run making a first down are if the rusher is a quarterback, if the game is in a retractable roof stadium with the roof open and if the weather is rainy.
# Logistic regression classifier.
clf_logistic = LogisticRegression()
clf_logistic = clf_logistic.fit(X_train, y_train.values.ravel())
# NOTE: predict() already returns 0/1 class labels (not probabilities), so
# the 0.5 threshold below is a no-op kept for parity with the linear model;
# predict_proba() would be needed for actual probabilities.
train_probabilities = clf_logistic.predict(X_train)
test_probabilities = clf_logistic.predict(X_test)
train_predictions = np.where(train_probabilities > 0.5, 1, 0)
test_predictions = np.where(test_probabilities > 0.5, 1, 0)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Logistic_Regression',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The logistic regression approach gave a training error rate of 0.172 and a test error rate of 0.175. These were both better than the linear regression approach but worse than bagging and random forest results.
# Coefficient plot for the logistic regression, sorted by value.
clf_logistic.coef_
importance_sorted_idx = np.argsort(clf_logistic.coef_[0])
tree_indices = np.arange(0, len(clf_logistic.coef_[0])) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_logistic.coef_[0][importance_sorted_idx], height=0.7)
# FIX: set tick positions before labels so they stay aligned.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
# BUG FIX: was len(clf_ridge.coef_[0]) -- clf_ridge is defined only in the
# later ridge section, so this raised NameError on a fresh run.
ax1.set_ylim((0, len(clf_logistic.coef_[0])))
The coefficients for the logistic regression model follow similar trends to the coefficients in the linear regression for the most part. If the rusher is a wide receiver or fullback the probability of the play gaining a first down is more likely. Also if the play comes from the Wildcat Formation it has a correlation with running plays getting a first down. Finally, the rusher's yards per game has a positive correlation to whether the run will reach a first down. However, Distance to go for first down is negatively correlated with whether the run will make a first down. Other negative correlations to a run making a first down are if the rusher is a quarterback, if the game is in a retractable roof stadium with the roof open and if the weather is rainy. The number of defenders in the box has a negative correlation with the first down binary value, meaning that as there are more players in the box on defense the less likely it will be that the run will make a first down.
# Linear discriminant analysis classifier.
clf_lda = LinearDiscriminantAnalysis()
clf_lda = clf_lda.fit(X_train, y_train.values.ravel())
train_predictions = clf_lda.predict(X_train)
test_predictions = clf_lda.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Linear_Discriminant_Analysis',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The linear discriminant analysis approach gave a training set error rate of 0.173 and a test set error rate of 0.172. This test error rate is the lowest error rate obtained for the classification model thus far.
# Coefficient plot for LDA, sorted by value.
importance_sorted_idx = np.argsort(clf_lda.coef_[0])
tree_indices = np.arange(0, len(clf_lda.coef_[0])) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_lda.coef_[0][importance_sorted_idx], height=0.7)
# FIX: set tick positions before labels so they stay aligned.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
# BUG FIX: was len(clf_ridge.coef_[0]) -- clf_ridge is defined only in the
# later ridge section, so this raised NameError on a fresh run.
ax1.set_ylim((0, len(clf_lda.coef_[0])))
The coefficients for LDA show similar trends to the logistic regression.
# Quadratic discriminant analysis classifier.
clf_qda = QuadraticDiscriminantAnalysis()
clf_qda = clf_qda.fit(X_train, y_train.values.ravel())
train_predictions = clf_qda.predict(X_train)
test_predictions = clf_qda.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Quadratic_Discriminant_Analysis',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The quadratic discriminant analysis approach gave a training set error rate of 0.203 and a test set error rate of 0.205. This test error rate is significantly higher than all of the other models tested and only slightly better than the null model. Perhaps assuming that each class has its own covariance matrix is a bad assumption, because LDA, with its common covariance matrix assumption, has a lower classification error rate. Also, since QDA is more flexible perhaps it is allowing too much flexibility in the model, although this does not show up as an overfitting issue because the training error rate is also higher than the others.
# Choose the ridge penalty by 5-fold cross-validation over a log-spaced
# grid of 13 values from 1e-6 to 1e6.
alpha_grid = np.logspace(-6, 6, 13)
reg = RidgeClassifierCV(alphas=alpha_grid, cv=5)
ridge_model = reg.fit(X_train, y_train.values.ravel())
ridge_model.alpha_
lamda_cv = ridge_model.alpha_
lamda_cv
# Manual CV sweep over the same alpha grid to visualize error vs. penalty.
alphas = np.logspace(-6, 6, 13)
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for alpha in alphas:
    clf_ridge = RidgeClassifier(alpha=alpha)
    cv_scores = cross_val_score(clf_ridge, X_train, y_train.values.ravel(),
                                cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1 - cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_ER_list.append(1 - accuracy_score(
        y_train, clf_ridge.fit(X_train, y_train.values.ravel()).predict(X_train)))
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(alphas, cv_scores_mean, '-o', label='CV ', alpha=0.9)
ax.plot(alphas, train_ER_list, '-*', label='train', alpha=0.9)
ax.set_title('Ridge Classification: Classification Error Rate vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('tuning Parameter', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.set_xscale('log')
ax.legend()
From the above plot, a tuning parameter of 1e-6 will be used in the Ridge Classification Model
# Final ridge classifier with the CV-selected penalty (lamda_cv).
clf_ridge = RidgeClassifier(alpha=lamda_cv)
clf_ridge = clf_ridge.fit(X_train, y_train.values.ravel())
train_predictions = clf_ridge.predict(X_train)
test_predictions = clf_ridge.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Ridge_Classification',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The ridge classification gave essentially the same results and model as the linear regression model. The test error rate of 0.177 is higher than most other models except QDA.
# Coefficient plot for the ridge classifier, sorted by value.
importance_sorted_idx = np.argsort(clf_ridge.coef_[0])
tree_indices = np.arange(0, len(clf_ridge.coef_[0])) + 0.5
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.barh(tree_indices,
         clf_ridge.coef_[0][importance_sorted_idx], height=0.7)
# FIX: set tick positions before labels; labelling before positioning can
# leave labels misaligned (and warns) in recent matplotlib versions.
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[importance_sorted_idx])
ax1.set_ylim((0, len(clf_ridge.coef_[0])))
K Nearest neighbors is a non-parametric model that will be used for classification. Cross-validation on the training set will be used to determine the number of neighbors for the final model.
# 5-fold CV over the coarse grid of neighbor counts for KNN.
# NOTE(review): number_neighbors is not defined anywhere in this section --
# it is presumably created in an earlier cell; confirm before a fresh run.
cv_scores_list = []
cv_scores_std = []
cv_scores_mean = []
train_ER_list = []
for neighbors in number_neighbors:
    clf_knn = KNeighborsClassifier(n_neighbors=neighbors)
    cv_scores = cross_val_score(clf_knn, X_train, y_train.values.ravel(),
                                cv=5, scoring='accuracy')
    cv_scores_list.append(cv_scores)
    cv_scores_mean.append(1 - cv_scores.mean())
    cv_scores_std.append(cv_scores.std())
    train_ER_list.append(1 - accuracy_score(
        y_train, clf_knn.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(neighbors)  # progress indicator -- KNN CV on this data is slow
# Refine the neighbor search on a finer grid (35..50 in steps of 5), then
# splice the fine-grid results into the coarse grid for a single plot.
number_neighbors2 = np.arange(35, 55, 5)
cv_scores_list2 = []
cv_scores_std2 = []
cv_scores_mean2 = []
train_ER_list2 = []
for neighbors in number_neighbors2:
    clf_knn = KNeighborsClassifier(n_neighbors=neighbors)
    cv_scores = cross_val_score(clf_knn, X_train, y_train.values.ravel(),
                                cv=5, scoring='accuracy')
    # BUG FIX: originally appended to cv_scores_list (the coarse-grid list)
    # instead of cv_scores_list2, so the fine-grid raw scores landed in the
    # wrong list and cv_scores_list2 stayed empty.
    cv_scores_list2.append(cv_scores)
    cv_scores_mean2.append(1 - cv_scores.mean())
    cv_scores_std2.append(cv_scores.std())
    train_ER_list2.append(1 - accuracy_score(
        y_train, clf_knn.fit(X_train, y_train.values.ravel()).predict(X_train)))
    print(neighbors)  # progress indicator
# Interleave the fine grid between the second and third coarse-grid values
# so the plot is ordered by k.
k_neighbors = list(number_neighbors[:2]) + list(number_neighbors2) + list(number_neighbors[2:])
training_k_neighbors = list(train_ER_list[:2]) + list(train_ER_list2) + list(train_ER_list[2:])
cv_scores_mean_knn = list(cv_scores_mean[:2]) + list(cv_scores_mean2) + list(cv_scores_mean[2:])
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
ax.plot(k_neighbors, cv_scores_mean_knn, '-o', label='CV ', alpha=0.9)
ax.plot(k_neighbors, training_k_neighbors, '-*', label='train', alpha=0.9)
ax.set_title('KNN Classification: Classification Error Rate vs. Tuning Parameter', fontsize=16)
ax.set_xlabel('Neighbors', fontsize=14)
ax.set_ylabel('Classification Error Rate', fontsize=14)
ax.legend()
From the plot above, it appears the classification error rate reaches a minimum at n=35 neighbors. This will be used for the final knn model.
# Final KNN with the CV-selected 35 neighbors.
clf_knn = KNeighborsClassifier(n_neighbors=35)
clf_knn = clf_knn.fit(X_train, y_train.values.ravel())
train_predictions = clf_knn.predict(X_train)
test_predictions = clf_knn.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'KNN_Classification',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The KNN classification with 35 neighbors gave a training set error rate of 0.183 and a test set error rate of 0.190. This is higher than all models other than the QDA and the Null Model.
# Gaussian Naive Bayes classifier.
clf_gnb = GaussianNB()
clf_gnb = clf_gnb.fit(X_train, y_train.values.ravel())
train_predictions = clf_gnb.predict(X_train)
test_predictions = clf_gnb.predict(X_test)
train_ER = 1 - accuracy_score(y_train, train_predictions)
test_ER = 1 - accuracy_score(y_test, test_predictions)
train_ER
test_ER
results = confusion_matrix(y_train, train_predictions)
df_cm = pd.DataFrame(results, index=['No', 'Yes'], columns=['No', 'Yes'])
df_cm
print(classification_report(y_test, test_predictions))
backup = copy.deepcopy(classification_models_df)
# pd.concat replaces the removed DataFrame.append (identical result row).
classification_models_df = pd.concat(
    [classification_models_df,
     pd.DataFrame([{'classification_models': 'Gaussian_Naive_Bayes',
                    'classification_train_ER': train_ER,
                    'classification_test_ER': test_ER}])],
    ignore_index=True)
classification_models_df
The Gaussian Naive Bayes had a training error rate of 0.208 and test error rate of 0.212. This test error rate is higher than all models, including the Null Model.
After analyzing 12 different classification models (in addition to the null model) for determining whether a rush will